# Load the cleaned social-media usage data ----
library(readxl)

# Path is parameterized (rather than inlined in the call) so the script is
# easier to port to another machine; fail fast with a clear message if the
# file is missing.
data_path <- "/Users/user/Downloads/social_media_cleaned.xlsx"
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}
sm <- read_excel(data_path)

# Inspect the structure: 21 respondents x 10 columns
# (one id column, eight usage columns, one weekly wellbeing score).
str(sm)
## tibble [21 × 10] (S3: tbl_df/tbl/data.frame)
##  $ character                    : chr [1:21] "masinl" "peace" "Patty" "Bunny" ...
##  $ Instagram                    : num [1:21] 3.5 7.73 3.77 5.38 12 ...
##  $ LinkedIn                     : num [1:21] 4 5.2 7 5.317 0.583 ...
##  $ SnapChat                     : num [1:21] 1 3.683 0.533 1.3 0 ...
##  $ Twitter                      : num [1:21] 5 0 0 0 0.667 ...
##  $ Whatsapp/Wechat              : num [1:21] 1 4.18 9.83 5.3 3 ...
##  $ youtube                      : num [1:21] 2.5 4.25 1.85 2 3.5 7 3 2 4 3 ...
##  $ OTT                          : num [1:21] 14.5 0 2 2 2 3 0 3 3 0 ...
##  $ Reddit                       : num [1:21] 2.5 0 0 0 1 0 0 0 0 0 ...
##  $ How you felt the entire week?: num [1:21] 3 3 4 4 3 5 4 3 3 2 ...

# Dependent variable: "How you felt the entire week?"; independent variables: Instagram, LinkedIn, SnapChat, Twitter, Whatsapp/Wechat, youtube, OTT, Reddit.

# Keep only the eight usage predictors. Selecting by name (rather than by
# position as in sm[, 2:9]) is robust to column reordering in the source
# spreadsheet.
predictor_cols <- setdiff(
  names(sm),
  c("character", "How you felt the entire week?")
)
sm1 <- sm[, predictor_cols]
sm1
## # A tibble: 21 × 8
##    Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube   OTT Reddit
##        <dbl>    <dbl>    <dbl>   <dbl>             <dbl>   <dbl> <dbl>  <dbl>
##  1     3.5      4        1       5                  1       2.5   14.5    2.5
##  2     7.73     5.2      3.68    0                  4.18    4.25   0      0  
##  3     3.77     7        0.533   0                  9.83    1.85   2      0  
##  4     5.38     5.32     1.3     0                  5.3     2      2      0  
##  5    12        0.583    0       0.667              3       3.5    2      1  
##  6     2.33     7        0.467   0                 12       7      3      0  
##  7     5.37     4        0       0                  6       3      0      0  
##  8     7        4        3       0                 10       2      3      0  
##  9     8.65    10        3.83    0                  6.15    4      3      0  
## 10     0.167    0        0       0                  1       3      0      0  
## # ℹ 11 more rows
#load necessary libraries
library(magrittr)   # pipe operator %>%
library(NbClust)    # index-based selection of the number of clusters
library(cluster)    # silhouette()
library(factoextra) # fviz_* visualization helpers (attaches ggplot2)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Cluster Analysis ----

# Hierarchical clustering (dendrograms): standardize the predictors, then
# build the pairwise Euclidean distance matrix reused by each hclust() below.
sm_scaled <- scale(x = sm1)
dist_matrix <- dist(sm_scaled, method = "euclidean")

# Single-linkage hierarchical clustering (nearest-neighbour merging),
# rendered as a dendrogram with factoextra.
hc <- hclust(d = dist_matrix, method = "single")
fviz_dend(hc)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Complete-linkage clustering; "complete" is hclust()'s default method,
# spelled out here for clarity.
hc <- hclust(d = dist_matrix, method = "complete")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")

# Average-linkage (UPGMA) clustering of the same distance matrix.
hc <- hclust(d = dist_matrix, method = "average")
plot(hc, hang = -1, cex = 0.6, main = "Dendrogram for Hierarchical Clustering")

#By observing the above dendrogram’s k=2 clusters will be sufficient.This is confirmed further with D index graphical representation.

# Cut the (average-linkage) tree into two groups and tabulate membership.
num_clusters <- 2
clusters <- cutree(tree = hc, k = num_clusters)

# Cluster sizes (labels 1..k are arbitrary)
table(clusters)
## clusters
##  1  2 
##  1 20
# Principal Components
# prcomp()'s argument is `scale.` (with a trailing dot); the original
# `scale = TRUE` only worked via partial argument matching, which is
# fragile style. Behavior is identical.
pca_result <- prcomp(sm1, scale. = TRUE)
pca_result
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
## 
## Rotation (n x k) = (8 x 8):
##                         PC1         PC2          PC3         PC4         PC5
## Instagram        0.43537868 -0.15356106  0.377944297  0.03258545 -0.36643468
## LinkedIn         0.35649468 -0.20991651 -0.329259391 -0.19907722  0.70255046
## SnapChat         0.15947567  0.03392372  0.717791182  0.10281712  0.51019404
## Twitter         -0.39046510 -0.53815939  0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat  0.52694693  0.06079220  0.007167203  0.09132133 -0.28308505
## youtube          0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT              0.08162766 -0.68423619  0.216480569 -0.12793328 -0.07632718
## Reddit          -0.12361633 -0.35601755 -0.139687974  0.90062217  0.08919336
##                         PC6         PC7         PC8
## Instagram       -0.51184307 -0.48893799 -0.08742222
## LinkedIn         0.02630366 -0.42220705 -0.09096859
## SnapChat        -0.04887482  0.39304838 -0.17449187
## Twitter         -0.02585534  0.04614556 -0.65794190
## Whatsapp/Wechat  0.68926770  0.02263817 -0.39306576
## youtube         -0.39774620  0.64519354 -0.03187491
## OTT              0.31710521  0.06567287  0.59264873
## Reddit          -0.02048963 -0.07045776 -0.11831273
#Non-Hierarchical Clustering(k-means)
# kmeans() starts from random centers, so fix the RNG seed to make the
# assignment reproducible between runs.
# NOTE(review): this clusters the raw (unscaled) data while the hierarchical
# clustering above used sm_scaled; consider kmeans(sm_scaled, ...) for a
# like-for-like comparison.
set.seed(123)
num_clusters <- 2
kmeans_model <- kmeans(sm1, centers = num_clusters)

# Membership for each cluster (cluster sizes; labels 1..k are arbitrary)
table(kmeans_model$cluster)
## 
##  1  2 
## 17  4
# Visualize cluster membership projected onto the first two principal
# components.
fviz_cluster(
  object = list(data = pca_result$x[, 1:2], cluster = kmeans_model$cluster)
)

# Visualize cluster centers for k-means.
# The original passed `frame.type = "convex"`, which factoextra deprecated
# in favour of `ellipse.type` (see the warnings the original call emitted).
fviz_cluster(kmeans_model, data = sm_scaled, geom = "point",
             ellipse.type = "convex", pointsize = 2, fill = "white",
             main = "K-means Cluster Centers")

# Visualize cluster and membership using first two Principal Components for k-means.
# Use the canonical `scale.` spelling (the original relied on partial
# matching via `scale=`). sm_scaled already has unit-variance columns, so
# the rescaling inside prcomp() is a no-op and the result is unchanged.
pca_result <- prcomp(sm_scaled, scale. = TRUE)
fviz_cluster(kmeans_model, data = pca_result$x[, 1:2], geom = "point",
             pointsize = 2, fill = "white", main = "K-means Clustering Result (PCA)")

# Calculate silhouette information for k-means clustering
# (widths near 1 = well separated; near 0 = on a cluster border).
sil <- silhouette(kmeans_model$cluster, dist(sm_scaled))

# Visualize the silhouette plot for k-means clustering
fviz_silhouette(sil, main = "Silhouette Plot for K-means Clustering")
##   cluster size ave.sil.width
## 1       1   17          0.13
## 2       2    4          0.07

# Optimal cluster count: NbClust evaluates a battery of indices on the
# scaled data for k = 2..10 under complete linkage and reports the
# majority vote.
res.nbclust <- sm1 %>%
  scale() %>%
  NbClust(distance = "euclidean", min.nc = 2, max.nc = 10,
          method = "complete", index = "all")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 2 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 4 proposed 8 as the best number of clusters 
## * 4 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************

#From cluster analysis I am able to recognize users whose social media usage patterns are similar to mine.

# PCA ----

#Get the correlations between the variables 

# Pairwise Pearson correlations; use = "complete.obs" drops rows containing
# any NA before computing (defensive — the cleaned data shows none).
cor(sm1, use = "complete.obs")
##                   Instagram     LinkedIn    SnapChat    Twitter Whatsapp/Wechat
## Instagram        1.00000000  0.097056399  0.28968877 -0.1930565       0.3776962
## LinkedIn         0.09705640  1.000000000  0.02552545 -0.1300685       0.2288356
## SnapChat         0.28968877  0.025525452  1.00000000 -0.1799569       0.0809998
## Twitter         -0.19305653 -0.130068464 -0.17995686  1.0000000      -0.4958329
## Whatsapp/Wechat  0.37769615  0.228835623  0.08099980 -0.4958329       1.0000000
## youtube          0.33000187  0.452197669 -0.16006877 -0.1881777       0.3716852
## OTT              0.26738122  0.185492527  0.13158590  0.5570740       0.1336204
## Reddit          -0.07461553 -0.006992884 -0.08116237  0.1649030      -0.1344497
##                     youtube       OTT       Reddit
## Instagram        0.33000187 0.2673812 -0.074615529
## LinkedIn         0.45219767 0.1854925 -0.006992884
## SnapChat        -0.16006877 0.1315859 -0.081162369
## Twitter         -0.18817769 0.5570740  0.164902964
## Whatsapp/Wechat  0.37168516 0.1336204 -0.134449660
## youtube          1.00000000 0.1605652  0.026399913
## OTT              0.16056523 1.0000000  0.232791099
## Reddit           0.02639991 0.2327911  1.000000000
#Computing Principal Components
# Use the canonical `scale.` argument name; the original `scale = TRUE`
# only worked through partial argument matching. Behavior is identical.
social_pca <- prcomp(sm1, scale. = TRUE)
social_pca
## Standard deviations (1, .., p=8):
## [1] 1.4936735 1.3063036 1.1303414 0.9384308 0.9007674 0.7694819 0.6078953
## [8] 0.3621675
## 
## Rotation (n x k) = (8 x 8):
##                         PC1         PC2          PC3         PC4         PC5
## Instagram        0.43537868 -0.15356106  0.377944297  0.03258545 -0.36643468
## LinkedIn         0.35649468 -0.20991651 -0.329259391 -0.19907722  0.70255046
## SnapChat         0.15947567  0.03392372  0.717791182  0.10281712  0.51019404
## Twitter         -0.39046510 -0.53815939  0.040482193 -0.33406772 -0.09484716
## Whatsapp/Wechat  0.52694693  0.06079220  0.007167203  0.09132133 -0.28308505
## youtube          0.45370070 -0.20727847 -0.406762139 -0.03615301 -0.09453089
## OTT              0.08162766 -0.68423619  0.216480569 -0.12793328 -0.07632718
## Reddit          -0.12361633 -0.35601755 -0.139687974  0.90062217  0.08919336
##                         PC6         PC7         PC8
## Instagram       -0.51184307 -0.48893799 -0.08742222
## LinkedIn         0.02630366 -0.42220705 -0.09096859
## SnapChat        -0.04887482  0.39304838 -0.17449187
## Twitter         -0.02585534  0.04614556 -0.65794190
## Whatsapp/Wechat  0.68926770  0.02263817 -0.39306576
## youtube         -0.39774620  0.64519354 -0.03187491
## OTT              0.31710521  0.06567287  0.59264873
## Reddit          -0.02048963 -0.07045776 -0.11831273
summary(social_pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion  0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
##                           PC8
## Standard deviation     0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion  1.0000
# Eigenvalues = squared singular values (variance captured by each PC).
eigen_social<- social_pca$sdev^2
eigen_social
## [1] 2.2310606 1.7064291 1.2776717 0.8806523 0.8113819 0.5921023 0.3695367
## [8] 0.1311653

#From the PCA summary it is evident that PC1 and PC2 together account for around 49% of the total variance.

#Screeplot

# Scree plot of the raw eigenvalues: look for the "elbow" where additional
# components stop contributing much variance.
plot(eigen_social, xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")

# Log scale makes the tail-off among the small eigenvalues easier to see.
plot(log(eigen_social), xlab = "Component number", ylab = "Component variance", type = "l", main = "Scree diagram")

library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.2
# FactoMineR::PCA standardizes by default, so passing the already-scaled
# matrix is harmless (re-scaling unit-variance columns is a no-op).
res.pca <- PCA(sm_scaled, graph = FALSE)
fviz_eig(res.pca, addlabels = TRUE)  # % of variance per component, labelled

###From the screeplot elbow it is beneficial to retain PC1 through PC6, as together they cover about 93% of the total variance.

#Visualization using PC’s

# FactoMineR and factoextra are already attached earlier in the script, so
# the duplicate library() calls here were dropped (they were harmless no-ops).
# Variable correlation circle: vectors pointing the same way indicate apps
# whose usage patterns correlate.
res.pca <- PCA(sm1, graph = FALSE)
fviz_pca_var(res.pca, col.var = "black")

#From the above I can tell that most of my classmates usage timings of the apps Instagram,Whatsapp/Wechat, LinkedIn, Youtube and snapchat are similar. Most probably, twitter and reddit are not used by me and my classmates.

# Factor Analysis ----

# load library for factor analysis
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

#Decide how many Factors are ideal for your dataset?

# Parallel analysis compares the observed eigenvalues against those from
# resampled/random data to suggest how many factors/components to retain.
fa.parallel(sm1)

## Parallel analysis suggests that the number of factors =  0  and the number of components =  0

#Parallel analysis suggests that the number of factors = 0 and the number of components = 0

# Explain the output for your factor model:
# principal() performs a PCA-based factor extraction with a varimax
# (orthogonal) rotation, retaining two rotated components (RC1, RC2).
fit.pc <- principal(r = sm1, nfactors = 2, rotate = "varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                   RC1   RC2    h2   u2 com
## Instagram        0.68  0.01 0.463 0.54 1.0
## LinkedIn         0.59  0.11 0.359 0.64 1.1
## SnapChat         0.22 -0.11 0.059 0.94 1.5
## Twitter         -0.36  0.84 0.834 0.17 1.4
## Whatsapp/Wechat  0.73 -0.30 0.626 0.37 1.3
## youtube          0.73  0.07 0.533 0.47 1.0
## OTT              0.37  0.82 0.814 0.19 1.4
## Reddit          -0.04  0.50 0.250 0.75 1.0
## 
##                        RC1  RC2
## SS loadings           2.19 1.75
## Proportion Var        0.27 0.22
## Cumulative Var        0.27 0.49
## Proportion Explained  0.56 0.44
## Cumulative Proportion 0.56 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.13 
##  with the empirical chi square  21.06  with prob <  0.072 
## 
## Fit based upon off diagonal values = 0.71

#High absolute values (close to 1) indicate a strong relationship between the variable and the factor. #h2 explains how much variance of the variables are explained by the factors. #u2 indicates the amount of variance not explained by the factors Principal Components Analysis Call: principal(r = sm1, nfactors = 2, rotate = “varimax”) Standardized loadings (pattern matrix) based upon correlation matrix

# (Stale output pasted from an earlier run — kept for reference, commented
# out so the script parses:)
#                    RC1  RC2
# SS loadings 2.27 1.80 Proportion Var 0.25 0.20 Cumulative Var 0.25 0.45 Proportion Explained 0.56 0.44 Cumulative Proportion 0.56 1.00
# Mean item complexity = 1.3 Test of the hypothesis that 2 components are sufficient.
# The root mean square of the residuals (RMSR) is 0.14 with the empirical chi square 29.01 with prob < 0.066

# Eigenvalues of the correlation matrix, rounded for readability.
round(fit.pc$values, 3)
## [1] 2.231 1.706 1.278 0.881 0.811 0.592 0.370 0.131
# Rotated loadings; entries below psych's print cutoff are left blank.
fit.pc$loadings
## 
## Loadings:
##                 RC1    RC2   
## Instagram        0.681       
## LinkedIn         0.589  0.111
## SnapChat         0.216 -0.110
## Twitter         -0.359  0.840
## Whatsapp/Wechat  0.732 -0.300
## youtube          0.727       
## OTT              0.371  0.822
## Reddit                  0.498
## 
##                  RC1   RC2
## SS loadings    2.189 1.749
## Proportion Var 0.274 0.219
## Cumulative Var 0.274 0.492
# Communalities (h2): share of each variable's variance reproduced by the
# two retained components.
fit.pc$communality
##       Instagram        LinkedIn        SnapChat         Twitter Whatsapp/Wechat 
##      0.46314710      0.35873574      0.05870521      0.83436255      0.62581187 
##         youtube             OTT          Reddit 
##      0.53256680      0.81378028      0.25038016
# Rotated factor scores, Notice the columns ordering: RC1, RC2
fit.pc
## Principal Components Analysis
## Call: principal(r = sm1, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                   RC1   RC2    h2   u2 com
## Instagram        0.68  0.01 0.463 0.54 1.0
## LinkedIn         0.59  0.11 0.359 0.64 1.1
## SnapChat         0.22 -0.11 0.059 0.94 1.5
## Twitter         -0.36  0.84 0.834 0.17 1.4
## Whatsapp/Wechat  0.73 -0.30 0.626 0.37 1.3
## youtube          0.73  0.07 0.533 0.47 1.0
## OTT              0.37  0.82 0.814 0.19 1.4
## Reddit          -0.04  0.50 0.250 0.75 1.0
## 
##                        RC1  RC2
## SS loadings           2.19 1.75
## Proportion Var        0.27 0.22
## Cumulative Var        0.27 0.49
## Proportion Explained  0.56 0.44
## Cumulative Proportion 0.56 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.13 
##  with the empirical chi square  21.06  with prob <  0.072 
## 
## Fit based upon off diagonal values = 0.71
fit.pc$scores
##               RC1         RC2
##  [1,] -0.49593354  3.81935345
##  [2,]  0.37453109 -0.45810128
##  [3,]  0.24035060 -0.39724117
##  [4,] -0.11821501 -0.28545411
##  [5,] -0.05535776  0.18730345
##  [6,]  1.33982873 -0.15394397
##  [7,] -0.16544326 -0.57370308
##  [8,]  0.36239954 -0.37616101
##  [9,]  1.27403174  0.04737682
## [10,] -1.47503111 -0.57670717
## [11,] -1.47090422  0.33142379
## [12,] -0.99564293  0.65382335
## [13,] -0.64587802 -0.84729946
## [14,] -0.54403365 -0.04349169
## [15,] -0.05025441  0.89199339
## [16,]  0.46518890 -0.19297427
## [17,]  0.63875357 -0.56313735
## [18,] -0.42630523 -0.44123138
## [19,]  0.50350291 -0.59787800
## [20,]  2.63240375  0.51258979
## [21,] -1.38799166 -0.93654010
fa.plot(fit.pc) # See correlations within factors (variables plotted on RC1/RC2)

# Show the columns that go into each factor:

fa.diagram(fit.pc) # Visualize which variables load on which factor

# Perform some visualizations using the factors:

#very simple structure visualization
# vss() compares factor solutions of increasing size via the Very Simple
# Structure criterion, Velicer MAP, and BIC. The Heywood-case warnings it
# emits are unsurprising with only 21 observations.
vss(sm1)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect.  Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## 
## Very Simple Structure
## Call: vss(x = sm1)
## VSS complexity 1 achieves a maximimum of 0.65  with  5  factors
## VSS complexity 2 achieves a maximimum of 0.78  with  7  factors
## 
## The Velicer MAP achieves a minimum of 0.07  with  1  factors 
## BIC achieves a minimum of  -35.54  with  1  factors
## Sample Size adjusted BIC achieves a minimum of  1.7  with  4  factors
## 
## Statistics by number of factors 
##   vss1 vss2   map dof   chisq prob sqresid  fit RMSEA   BIC SABIC complex
## 1 0.39 0.00 0.071  20 2.5e+01 0.19     7.0 0.39   0.1 -35.5  26.2     1.0
## 2 0.52 0.64 0.090  13 8.9e+00 0.78     4.2 0.64   0.0 -30.7   9.4     1.2
## 3 0.55 0.69 0.117   7 4.7e+00 0.70     2.9 0.75   0.0 -16.6   5.0     1.5
## 4 0.60 0.74 0.180   2 1.6e+00 0.45     2.1 0.81   0.0  -4.5   1.7     1.5
## 5 0.65 0.78 0.271  -2 6.1e-01   NA     1.4 0.87    NA    NA    NA     1.3
## 6 0.62 0.78 0.443  -5 3.9e-10   NA     1.2 0.90    NA    NA    NA     1.5
## 7 0.62 0.78 1.000  -7 0.0e+00   NA     1.2 0.90    NA    NA    NA     1.5
## 8 0.62 0.78    NA  -8 0.0e+00   NA     1.2 0.90    NA    NA    NA     1.5
##    eChisq    SRMR eCRMS  eBIC
## 1 3.2e+01 1.7e-01 0.196 -28.5
## 2 9.2e+00 8.8e-02 0.130 -30.4
## 3 2.1e+00 4.2e-02 0.084 -19.2
## 4 7.9e-01 2.6e-02 0.097  -5.3
## 5 2.2e-01 1.4e-02    NA    NA
## 6 1.7e-10 3.8e-07    NA    NA
## 7 2.2e-18 4.3e-11    NA    NA
## 8 2.2e-18 4.3e-11    NA    NA

# (Stale output pasted from an earlier run — kept for reference, commented
# out so the script parses:)
# Very Simple Structure Call: vss(x = sm1) VSS complexity 1 achieves a maximum of 0.61 with 6 factors VSS complexity 2 achieves a maximum of 0.78 with 7 factors
# The Velicer MAP achieves a minimum of 0.06 with 1 factors BIC achieves a minimum of -53.17 with 1 factors Sample Size adjusted BIC achieves a minimum of 1.47 with 5 factors
# Statistics by number of factors

# Computing Correlation Matrix
corrm.social <- cor(sm1)  # Pearson correlations of the eight usage columns
corrm.social
##                   Instagram     LinkedIn    SnapChat    Twitter Whatsapp/Wechat
## Instagram        1.00000000  0.097056399  0.28968877 -0.1930565       0.3776962
## LinkedIn         0.09705640  1.000000000  0.02552545 -0.1300685       0.2288356
## SnapChat         0.28968877  0.025525452  1.00000000 -0.1799569       0.0809998
## Twitter         -0.19305653 -0.130068464 -0.17995686  1.0000000      -0.4958329
## Whatsapp/Wechat  0.37769615  0.228835623  0.08099980 -0.4958329       1.0000000
## youtube          0.33000187  0.452197669 -0.16006877 -0.1881777       0.3716852
## OTT              0.26738122  0.185492527  0.13158590  0.5570740       0.1336204
## Reddit          -0.07461553 -0.006992884 -0.08116237  0.1649030      -0.1344497
##                     youtube       OTT       Reddit
## Instagram        0.33000187 0.2673812 -0.074615529
## LinkedIn         0.45219767 0.1854925 -0.006992884
## SnapChat        -0.16006877 0.1315859 -0.081162369
## Twitter         -0.18817769 0.5570740  0.164902964
## Whatsapp/Wechat  0.37168516 0.1336204 -0.134449660
## youtube          1.00000000 0.1605652  0.026399913
## OTT              0.16056523 1.0000000  0.232791099
## Reddit           0.02639991 0.2327911  1.000000000
# plot() on a bare numeric matrix just scatters its values, which is not
# informative for a correlation matrix; a symmetric heatmap (base stats)
# shows the correlation structure instead.
heatmap(corrm.social, symm = TRUE, main = "Correlation heatmap")

# Re-fit the PCA using the canonical `scale.` argument name (the original
# `scale=TRUE` relied on partial argument matching). Result is identical.
social_pca <- prcomp(sm1, scale. = TRUE)
summary(social_pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.4937 1.3063 1.1303 0.9384 0.9008 0.76948 0.60790
## Proportion of Variance 0.2789 0.2133 0.1597 0.1101 0.1014 0.07401 0.04619
## Cumulative Proportion  0.2789 0.4922 0.6519 0.7620 0.8634 0.93741 0.98360
##                           PC8
## Standard deviation     0.3622
## Proportion of Variance 0.0164
## Cumulative Proportion  1.0000
plot(social_pca)

#Biplot Visualization

# Biplot of the two rotated components: points are respondents' factor
# scores, vectors are each app's loadings on RC1/RC2.
biplot(fit.pc)

#I feel factor analysis is not beneficial for the social media data because I observed that we are missing the most part of the uniqueness of these apps by including factors and we are able to capture only a small portion of variances by using factors. #And parallel analysis screeplot indicated that the ideal number of factors for the social media data is zero. #From the component analysis we got similar results to PCA, where the apps like Instagram, whatsapp/wechat, LinkedIn, Youtube, Snapchat usages are a bit similar and high compared to OTT, Twitter and Reddit.

# Multiple Regression ----

# Keep the eight predictors plus the wellbeing outcome. Dropping the id
# column by name (rather than sm[, 2:10]) is robust to column reordering.
sm2 <- sm[, names(sm) != "character"]
sm2
## # A tibble: 21 × 9
##    Instagram LinkedIn SnapChat Twitter `Whatsapp/Wechat` youtube   OTT Reddit
##        <dbl>    <dbl>    <dbl>   <dbl>             <dbl>   <dbl> <dbl>  <dbl>
##  1     3.5      4        1       5                  1       2.5   14.5    2.5
##  2     7.73     5.2      3.68    0                  4.18    4.25   0      0  
##  3     3.77     7        0.533   0                  9.83    1.85   2      0  
##  4     5.38     5.32     1.3     0                  5.3     2      2      0  
##  5    12        0.583    0       0.667              3       3.5    2      1  
##  6     2.33     7        0.467   0                 12       7      3      0  
##  7     5.37     4        0       0                  6       3      0      0  
##  8     7        4        3       0                 10       2      3      0  
##  9     8.65    10        3.83    0                  6.15    4      3      0  
## 10     0.167    0        0       0                  1       3      0      0  
## # ℹ 11 more rows
## # ℹ 1 more variable: `How you felt the entire week?` <dbl>

# 1. Model Development ----

# Performing multiple regression on the dataset.
# Fit with bare column names against `data = sm2` (not sm2$col inside the
# formula): the sm2$ form bakes the full training vectors into the model
# terms, which makes predict(fit, newdata = ...) silently ignore the new
# data (the "'newdata' had 1 row but variables found have 21 rows" warning
# seen later in this script). `~ .` regresses the wellbeing score on all
# eight usage columns, the same model as before.
fit <- lm(`How you felt the entire week?` ~ ., data = sm2)
#show the results
summary(fit)
## 
## Call:
## lm(formula = sm2$`How you felt the entire week?` ~ sm2$Instagram + 
##     sm2$LinkedIn + sm2$SnapChat + sm2$Twitter + sm2$`Whatsapp/Wechat` + 
##     sm2$youtube + sm2$OTT + sm2$Reddit, data = sm2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0693 -0.3702 -0.0812  0.4818  1.4647 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            2.68725    0.67260   3.995  0.00178 **
## sm2$Instagram         -0.03358    0.06371  -0.527  0.60778   
## sm2$LinkedIn           0.12267    0.08573   1.431  0.17799   
## sm2$SnapChat           0.04470    0.06202   0.721  0.48488   
## sm2$Twitter            0.18243    0.26988   0.676  0.51188   
## sm2$`Whatsapp/Wechat`  0.04077    0.06708   0.608  0.55467   
## sm2$youtube            0.06912    0.13459   0.514  0.61692   
## sm2$OTT               -0.08423    0.09193  -0.916  0.37757   
## sm2$Reddit            -0.02812    0.12209  -0.230  0.82174   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8115 on 12 degrees of freedom
## Multiple R-squared:  0.2784, Adjusted R-squared:  -0.2026 
## F-statistic: 0.5788 on 8 and 12 DF,  p-value: 0.7774

#From the above summary, the overall F-test p-value of 0.7774 indicates the model as a whole is not statistically significant: none of the usage variables reliably predict the weekly wellbeing score.

coefficients(fit)
##           (Intercept)         sm2$Instagram          sm2$LinkedIn 
##            2.68724656           -0.03357685            0.12266943 
##          sm2$SnapChat           sm2$Twitter sm2$`Whatsapp/Wechat` 
##            0.04469741            0.18242992            0.04076703 
##           sm2$youtube               sm2$OTT            sm2$Reddit 
##            0.06911591           -0.08423071           -0.02811749

###From the above we get the fitted regression equation y = b0 + b1*x1 + b2*x2 + ... + bn*xn, where the intercept is b0 = 2.69 and the coefficients are b1 = -0.034, etc.

fitted(fit)
##        1        2        3        4        5        6        7        8 
## 2.939170 3.694387 3.803577 3.402626 2.645129 4.208769 3.449678 3.370189 
##        9       10       11       12       13       14       15       16 
## 4.069330 2.929765 3.364427 3.609704 3.518215 3.308767 3.177730 3.535305 
##       17       18       19       20       21 
## 3.151638 2.908330 3.696886 3.135180 3.081197

# Residual Analysis ----

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise scatterplot/correlation matrix of all nine variables.
ggpairs(data=sm2, title="Social-Media")

plot(fit, which = 1)  # Residuals vs Fitted (base lm diagnostic)

plot(fit, which = 2)  # Normal Q-Q plot of standardized residuals

# Residuals against fitted values to check for homoscedasticity.
# (Renamed from `residuals` to avoid shadowing stats::residuals.)
resid_vals <- residuals(fit)
fitted_vals <- fitted(fit)
plot_resid_fitted <- ggplot() +
  geom_point(aes(x = fitted_vals, y = resid_vals)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    x = "Fitted Values",
    y = "Residuals",
    title = "Residuals vs Fitted Values Plot"
  ) +
  theme_minimal()
print(plot_resid_fitted)

#The residuals vs. fitted plot is a tool for checking the regression assumptions: a random scatter around zero is consistent with linearity and constant variance, whereas a systematic pattern (curvature, funnel shape) would indicate the model is misspecified.

# Prediction ----

# Predict the wellbeing score for one hypothetical respondent.
# The newdata column names must exactly match the model's predictors — the
# original supplied `Whatsapp` instead of `Whatsapp/Wechat`, so predict()
# could not match the variables and fell back to the 21 training rows
# (hence the "'newdata' had 1 row but variables found have 21 rows"
# warning). NOTE(review): this also requires the model to have been fit via
# a data= formula with bare column names, not sm2$col terms.
new_obs <- data.frame(
  Instagram = 8, LinkedIn = 5, SnapChat = 4, Twitter = 4,
  `Whatsapp/Wechat` = 4, youtube = 8, OTT = 3, Reddit = 4,
  check.names = FALSE
)
predict(fit, newdata = new_obs)
## Warning: 'newdata' had 1 row but variables found have 21 rows
##        1        2        3        4        5        6        7        8 
## 2.939170 3.694387 3.803577 3.402626 2.645129 4.208769 3.449678 3.370189 
##        9       10       11       12       13       14       15       16 
## 4.069330 2.929765 3.364427 3.609704 3.518215 3.308767 3.177730 3.535305 
##       17       18       19       20       21 
## 3.151638 2.908330 3.696886 3.135180 3.081197

# Model Accuracy ----

#Make predictions using the model
predicted <- predict(fit, newdata = sm2)
#Calculating RMSE by taking the square root of the mean of the squared differences between the actual values and the predicted values (predicted)
# NOTE: sm2 is the training data, so this is in-sample RMSE; it will be
# optimistic relative to performance on unseen respondents.
rmse <- sqrt(mean((sm2$`How you felt the entire week?` - predicted)^2))
rmse
## [1] 0.6134638

#Note: RMSE is on the scale of the response (a 1-5 wellbeing score), so 0.613 means predictions are off by about 0.6 points on average; because it is computed on the training data it overstates accuracy on new respondents, and "between 0 and 1" is not itself a meaningful benchmark.

# Some Visualizations ----

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
#Nonlinearity
# Component + residual (partial residual) plots: check each predictor's
# relationship with the response for non-linearity.
crPlots(fit)

# Studentized residuals vs. fitted values; car is already attached above,
# so the duplicate library(car) call was removed.
spreadLevelPlot(fit)

## 
## Suggested power transformation:  -3.296001